package com.kdtech.analyse.news;
import com.kdtech.analyse.AnalyseNews;
import com.kdtech.analyse.JSoupUtils;
import com.kdtech.crawler.CrawlHTML;
import com.kdtech.crawler.at.UrlArgumentTop;
import com.kdtech.entity.crawler.UrlMeta;
import com.kdtech.entity.data.NewsMeta;
import com.kdtech.utils.DateUtils;
import com.kdtech.utils.HtmlCleaner;
import com.kdtech.utils.StringUtils;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

// Sample article URL:
//http://www.gospeltimes.cn/news/29448/%E9%82%93%E7%B4%AB%E6%A3%8B%E3%80%8A%E6%88%91%E6%98%AF%E6%AD%8C%E6%89%8B2%E3%80%8B%E9%A6%96%E6%AC%A1%E5%A4%BA%E5%86%A0%20%20%E4%B8%8E%E9%BB%84%E7%BB%AE%E7%8F%8A%E4%B8%80%E6%A0%B7%E5%90%8C%E6%98%AF%E7%AC%AC%E4%B8%89%E6%9C%9F
/**
 * Page parser for the Gospel Times news website (www.gospeltimes.cn).
 *
 * <p>Decides whether a URL matches one of the site's known URL shapes and
 * extracts title, content, author and date from a fetched page.
 *
 * @author Chase
 */
public class GospeltimesNewsAnalyse implements AnalyseNews {

	/**
	 * URL shapes treated as detail pages. Compiled once instead of on every
	 * call; literal dots in the host name and file extensions are escaped so
	 * they no longer match arbitrary characters.
	 */
	private static final Pattern[] DETAIL_PAGE_PATTERNS = {
		Pattern.compile("http://www\\.gospeltimes\\.cn/.*/[0-9]*_[0-9]*_[0-9]*/[0-9]*\\.htm"),
		Pattern.compile("http://www\\.gospeltimes\\.cn/.*\\.php[?]rid=[0-9]*"),
		Pattern.compile("http://www\\.gospeltimes\\.cn/news/[0-9]*/.*"),
		// NOTE(review): very broad - any path made of lowercase letters, digits,
		// dots and slashes matches, which presumably includes non-article pages.
		// Confirm this is intended.
		Pattern.compile("http://www\\.gospeltimes\\.cn/[.a-z0-9/]+")
	};

	/**
	 * Text inside {@code div.article} (Chinese for "font size: large | medium |
	 * small"); everything up to and including it is dropped from the content.
	 */
	private static final String FONT_SIZE_MARKER = "字体：大 | 中 | 小";

	/**
	 * Tells whether the URL matches one of the known detail-page patterns.
	 *
	 * @param url absolute URL to test; may be {@code null}
	 * @return {@code true} if the whole URL matches at least one pattern,
	 *         {@code false} otherwise (including for a {@code null} URL)
	 */
	public boolean isDetailPage(String url) {
		if (url == null) {
			return false;
		}
		for (Pattern pattern : DETAIL_PAGE_PATTERNS) {
			if (pattern.matcher(url).matches()) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Parses a fetched page into a {@link NewsMeta}.
	 *
	 * <p>The page is parsed regardless of whether its URL passes
	 * {@link #isDetailPage(String)}; callers that only want detail pages must
	 * filter beforehand.
	 *
	 * @param urlMeta fetched page; must carry a non-null URL and HTML source
	 * @return a new {@code NewsMeta} with url, title, content, author and date
	 *         set; individual fields may be blank or {@code null} when the
	 *         page does not contain them
	 * @throws IllegalArgumentException if {@code urlMeta}, its URL or its HTML
	 *         is {@code null} (previously this surfaced as an unexplained
	 *         failure further down, e.g. inside the HTML parser)
	 */
	public NewsMeta parserHtml(UrlMeta urlMeta) {
		if (urlMeta == null) {
			throw new IllegalArgumentException("urlMeta must not be null");
		}
		String url = urlMeta.getUrl();
		String html = urlMeta.getHtml();
		if (url == null) {
			throw new IllegalArgumentException("urlMeta has no URL");
		}
		if (html == null) {
			throw new IllegalArgumentException("No HTML source available for " + url);
		}

		Document doc = Jsoup.parse(html);

		String title = extractTitle(doc);
		String content = extractContent(url, doc);
		String author = JSoupUtils.matchAuthor(doc, "来源:");
		Long date = extractDate(url, doc);

		NewsMeta news = new NewsMeta();
		news.setUrl(url);
		news.setTitle(title);
		news.setContent(content);
		news.setAuthor(author);
		news.setDate(date);
		return news;
	}

	/**
	 * Reads the text of the {@code <title>} element and keeps the part before
	 * the first hyphen.
	 */
	private static String extractTitle(Document doc) {
		String title = doc.select("title").text();
		// NOTE(review): a headline that itself contains '-' is cut at its first
		// hyphen; presumably the hyphen is meant to strip a site-name suffix.
		if (title.contains("-")) {
			title = StringUtils.substringBefore(title, "-");
		}
		return title;
	}

	/**
	 * Extracts the article content, trying the known container elements in
	 * order: {@code div.article} (text after the font-size marker), then
	 * {@code div.a-style-smallimg}, then {@code div.details-text}.
	 */
	private static String extractContent(String url, Document doc) {
		String content = HtmlCleaner.getContentHtml(url, doc.select("div.article"));
		// NOTE(review): if substringAfter yields an empty string when the marker
		// is absent (commons-lang semantics), div.article content without the
		// marker is discarded here and the fallbacks below take over - verify.
		content = StringUtils.substringAfter(content, FONT_SIZE_MARKER);
		if (StringUtils.isBlank(content)) {
			content = HtmlCleaner.getContentHtml(url, doc.select("div.a-style-smallimg"));
		}
		if (StringUtils.isBlank(content)) {
			content = HtmlCleaner.getContentHtml(url, doc.select("div.details-text"));
		}
		return content;
	}

	/**
	 * Looks for a date first in the URL, then in the text of
	 * {@code div.gray12}, then in the text of {@code .source-ul}.
	 *
	 * @return the first non-null match, or {@code null} if none is found
	 */
	private static Long extractDate(String url, Document doc) {
		Long date = DateUtils.matchDate(url);
		if (date == null) {
			date = DateUtils.matchDate(doc.select("div.gray12").text());
		}
		if (date == null) {
			date = DateUtils.matchDate(doc.select(".source-ul").text());
		}
		return date;
	}

	/**
	 * Manual smoke test: runs the parser against one hard-coded URL and prints
	 * the intermediate values and the parse result.
	 */
	public static void main(String[] args) {
		GospeltimesNewsAnalyse analyse=new GospeltimesNewsAnalyse();
		String url="http://www.gospeltimes.cn/index.php/portal/article/index/id/37789";
		url=UrlArgumentTop.FromatUrl(url);
		System.out.println(UrlArgumentTop.FromatUrl(url));
		System.out.println(analyse.isDetailPage(url));
		System.err.println(url);
		UrlMeta meta=CrawlHTML.responseToURL(url);
		System.out.println(analyse.parserHtml(meta));
	}

	/**
	 * Not implemented for this site.
	 *
	 * @param meta ignored
	 * @return always {@code null}
	 */
	public NewsMeta Update(NewsMeta meta) {
		return null;
	}

	/**
	 * @return always {@code false}
	 */
	public boolean isNeedUpdate(){
		return false;
	}
}
